knitr::opts_chunk$set(echo = TRUE, message = FALSE)
library(Seurat)
library(ggplot2)
library(data.table)
library(MAST)
library(SingleR)
library(dplyr)
library(tidyr)
library(limma)
library(ggrepel)
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggrepel_0.8.2               limma_3.44.3               
##  [3] tidyr_1.1.1                 dplyr_1.0.2                
##  [5] SingleR_1.2.4               MAST_1.14.0                
##  [7] SingleCellExperiment_1.10.1 SummarizedExperiment_1.18.2
##  [9] DelayedArray_0.14.1         matrixStats_0.56.0         
## [11] Biobase_2.48.0              GenomicRanges_1.40.0       
## [13] GenomeInfoDb_1.24.2         IRanges_2.22.2             
## [15] S4Vectors_0.26.1            BiocGenerics_0.34.0        
## [17] data.table_1.13.0           ggplot2_3.3.2              
## [19] Seurat_3.2.0               
## 
## loaded via a namespace (and not attached):
##   [1] AnnotationHub_2.20.1          BiocFileCache_1.12.1         
##   [3] plyr_1.8.6                    igraph_1.2.5                 
##   [5] lazyeval_0.2.2                splines_4.0.2                
##   [7] BiocParallel_1.22.0           listenv_0.8.0                
##   [9] digest_0.6.25                 htmltools_0.5.0              
##  [11] magrittr_1.5                  memoise_1.1.0                
##  [13] tensor_1.5                    cluster_2.1.0                
##  [15] ROCR_1.0-11                   globals_0.12.5               
##  [17] colorspace_1.4-1              blob_1.2.1                   
##  [19] rappdirs_0.3.1                xfun_0.16                    
##  [21] crayon_1.3.4                  RCurl_1.98-1.2               
##  [23] jsonlite_1.7.0                spatstat_1.64-1              
##  [25] spatstat.data_1.4-3           survival_3.2-3               
##  [27] zoo_1.8-8                     ape_5.4-1                    
##  [29] glue_1.4.1                    polyclip_1.10-0              
##  [31] gtable_0.3.0                  zlibbioc_1.34.0              
##  [33] XVector_0.28.0                leiden_0.3.3                 
##  [35] BiocSingular_1.4.0            future.apply_1.6.0           
##  [37] abind_1.4-5                   scales_1.1.1                 
##  [39] DBI_1.1.0                     miniUI_0.1.1.1               
##  [41] Rcpp_1.0.5                    viridisLite_0.3.0            
##  [43] xtable_1.8-4                  reticulate_1.16              
##  [45] bit_4.0.4                     rsvd_1.0.3                   
##  [47] htmlwidgets_1.5.1             httr_1.4.2                   
##  [49] RColorBrewer_1.1-2            ellipsis_0.3.1               
##  [51] ica_1.0-2                     pkgconfig_2.0.3              
##  [53] uwot_0.1.8                    dbplyr_1.4.4                 
##  [55] deldir_0.1-28                 tidyselect_1.1.0             
##  [57] rlang_0.4.7                   reshape2_1.4.4               
##  [59] later_1.1.0.1                 AnnotationDbi_1.50.3         
##  [61] munsell_0.5.0                 BiocVersion_3.11.1           
##  [63] tools_4.0.2                   generics_0.0.2               
##  [65] RSQLite_2.2.0                 ExperimentHub_1.14.1         
##  [67] ggridges_0.5.2                evaluate_0.14                
##  [69] stringr_1.4.0                 fastmap_1.0.1                
##  [71] yaml_2.2.1                    goftest_1.2-2                
##  [73] knitr_1.29                    bit64_4.0.2                  
##  [75] fitdistrplus_1.1-1            purrr_0.3.4                  
##  [77] RANN_2.6.1                    pbapply_1.4-3                
##  [79] future_1.18.0                 nlme_3.1-148                 
##  [81] mime_0.9                      compiler_4.0.2               
##  [83] plotly_4.9.2.1                curl_4.3                     
##  [85] png_0.1-7                     interactiveDisplayBase_1.26.3
##  [87] spatstat.utils_1.17-0         tibble_3.0.3                 
##  [89] stringi_1.4.6                 lattice_0.20-41              
##  [91] Matrix_1.2-18                 vctrs_0.3.2                  
##  [93] pillar_1.4.6                  lifecycle_0.2.0              
##  [95] BiocManager_1.30.10           lmtest_0.9-37                
##  [97] RcppAnnoy_0.0.16              BiocNeighbors_1.6.0          
##  [99] cowplot_1.0.0                 bitops_1.0-6                 
## [101] irlba_2.3.3                   httpuv_1.5.4                 
## [103] patchwork_1.0.1               R6_2.4.1                     
## [105] promises_1.1.1                KernSmooth_2.23-17           
## [107] gridExtra_2.3                 codetools_0.2-16             
## [109] MASS_7.3-52                   assertthat_0.2.1             
## [111] withr_2.2.0                   sctransform_0.2.1            
## [113] GenomeInfoDbData_1.2.3        mgcv_1.8-31                  
## [115] grid_4.0.2                    rpart_4.1-15                 
## [117] rmarkdown_2.3                 DelayedMatrixStats_1.10.1    
## [119] Rtsne_0.15                    shiny_1.5.0

This File

Labeling the Nbeal clusters, to figure out where they are getting moved to in the integrated data. The goal here is to better label the clusters of the integrated dataset with higher confidence

Nbeal Data Only

cnt.data <- Read10X(data.dir = './data/Experiment2/filtered_feature_bc_matrix/')

cnt <- CreateSeuratObject(counts = cnt.data, project = 'Nbeal', min.cells = 3, min.features = 200)

# Getting the HTOs

nbeal_hto <- read.table('./data/Experiment2/hto_labels.txt')

nbeal_hto <- nbeal_hto[nbeal_hto$V2 %in% c('HTO3','HTO4'),]

nbeal_hto$condition <- ifelse(nbeal_hto$V2 == 'HTO3', 'Nbeal_cntrl', 'enrNbeal_cntrl')

nbeal_hto$cell <- paste0(nbeal_hto$V1, '-1')

# summary(nbeal_hto$cell %in% colnames(cnt))

cnt <- cnt[,colnames(cnt) %in% nbeal_hto$cell]

# Making sure the cell order is maintained between the two dataframes, so I can
# just add the condition to the meta data

#summary(rownames(wbm@meta.data) == htos$Barcode)

# Adding the condition to the meta data

cnt@meta.data$Condition <- nbeal_hto$condition

cnt[['percent.mt']] <- PercentageFeatureSet(cnt, pattern = '^mt')

less.cnt <- subset(cnt, subset = nFeature_RNA > 500 & percent.mt < 10)
less.cnt <- NormalizeData(less.cnt, verbose = F)

less.cnt <- FindVariableFeatures(less.cnt,
                                        selection.method = 'vst',
                                        nfeatures = 2000,
                                        verbose = F)

less.cnt <- ScaleData(less.cnt, verbose = F)

less.cnt <- RunPCA(less.cnt, features = VariableFeatures(less.cnt))

ElbowPlot(less.cnt)

# choosing 15 PCs

less.cnt <- FindNeighbors(less.cnt, dims = 1:15)

res <- seq(0,1, by = 0.05)
clstrs <- c()

for (i in res){
        x <- FindClusters(less.cnt, resolution = i, verbose = F)
        clstrs <- c(clstrs, length(unique(x$seurat_clusters)))
        
}
plot(res,clstrs)

# Going with .2 and .7

less.cnt <- FindClusters(less.cnt, resolution = .2, verbose = F)
less.cnt <- FindClusters(less.cnt, resolution = .7, verbose = F)

less.cnt <- RunUMAP(less.cnt, dims = 1:15)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
# DimPlot(less.cnt, reduction = 'umap', label = T, repel = T) + NoLegend() + ggtitle ('Resolution 0.7')

DimPlot(less.cnt, reduction = 'umap', label = T, repel = T, group.by = 'RNA_snn_res.0.2') +
        NoLegend() + ggtitle ('Resolution 0.2')

Comparison Between Clustering

paste0(0:14,' = ',c('Gran-1','Gran-2','?GMP','Bcell-1','Gran-3','Monocyte','?MEP/MAST','?CMP','Macrophage',
          'Bcell-2','Erythrocyte','Tcell','MK','Bcell-3','Bcell-4'))
##  [1] "0 = Gran-1"       "1 = Gran-2"       "2 = ?GMP"         "3 = Bcell-1"     
##  [5] "4 = Gran-3"       "5 = Monocyte"     "6 = ?MEP/MAST"    "7 = ?CMP"        
##  [9] "8 = Macrophage"   "9 = Bcell-2"      "10 = Erythrocyte" "11 = Tcell"      
## [13] "12 = MK"          "13 = Bcell-3"     "14 = Bcell-4"

Going to go with clustering resolution 0.2, which is what is displayed above. Of key is cluster 3 which becomes clusters 6 (MEP/Mast), 7 (CMP), part of 10 (Erythrocytes) and 12 (MKs); in the integrated dataset, see v2.1.3 Cluster Centroid for more details.

Nbeal Data in Integrated Data Set

# Calling the Seurat variable wbm instead of comb.int which is what it was previously

wbm <- readRDS('./data/v2/lesser.combined.integrated.rds')

wbm$State <- wbm$Condition

wbm$Condition <- ifelse(grepl('enr', wbm$Condition), 'Enriched', 'Not enriched')

wbm$Experiment <- ifelse(grepl('Mpl', wbm$State), 'Mpl',
                         ifelse(grepl('Migr', wbm$State), 'Migr1', 'Control'))

sumry <- read.table('./data/v2/summary_naming.tsv', header = T, sep = '\t')
# sumry
# sumry2 <- sumry
# sumry$final2 <- sumry$final
# sumry$final2[c(3,8)] <- c('?GMP','?CMP')
# write.table(sumry,'./data/v2/summary_naming.tsv', quote = F, row.names = F, sep = '\t')

lbls <- c('Gran-1','Gran-2','?GMP','Bcell-1','Gran-3','Monocyte','?MEP/MAST','?CMP','Macrophage',
          'Bcell-2','Erythrocyte','Tcell','MK','Bcell-3','Bcell-4')

DimPlot(wbm, reduction = 'umap', label = T, repel = T)

new_levels <- lbls

names(new_levels)  <- levels(wbm)
#new_levels
wbm <- RenameIdents(wbm, new_levels)
wbm$new_cluster_IDs <- Idents(wbm)

# DimPlot(wbm, reduction = 'umap', label = T, repel = T) + NoLegend()
# 
# DimPlot(wbm, reduction = 'umap', split.by = 'Condition') + NoLegend()
# 
# DimPlot(wbm, reduction = 'umap', split.by = 'Experiment') + NoLegend()
# 
# DimPlot(wbm, reduction = 'umap', split.by = 'State', ncol = 3) + NoLegend()

Now I am going to do a similar labeling as in file v2.2.Labeling, which is what was used to generate the cell types shown, but just in the NBeal-control cells.

nbeal <- subset(wbm, Experiment == 'Control')

DimPlot(nbeal, reduction = 'umap', label = T, repel = T)

table(nbeal$new_cluster_IDs)
## 
##      Gran-1      Gran-2        ?GMP     Bcell-1      Gran-3    Monocyte 
##         403         335         149         236         167         148 
##   ?MEP/MAST        ?CMP  Macrophage     Bcell-2 Erythrocyte       Tcell 
##          18          26          33         103          35          47 
##          MK     Bcell-3     Bcell-4 
##         131          84          33
DefaultAssay(nbeal) <- 'RNA'

SingleR

# Loading up reference datasets
m.ref.immgen <- ImmGenData()
m.ref.mus <- MouseRNAseqData()

ref <- list(m.ref.immgen, m.ref.mus)
ref.label <- list(m.ref.immgen$label.main, m.ref.mus$label.main)

# Creating a sc experiment from our seurat object

SCnbeal <- as.SingleCellExperiment(nbeal)

# Predicting the Cluster label
pred_cluster <- SingleR(test = SCnbeal,
                        ref = ref,
                        labels = ref.label,
                        method = 'cluster',
                        clusters = SCnbeal$new_cluster_IDs)

# Predicting individual cell labels
pred_cell <- SingleR(test = SCnbeal,
                     ref = ref,
                     labels = ref.label,
                     method = 'single')

Cluster Labels

# pred_cluster$scores
# 
# pred_cluster$labels
# 
# pred_cluster$pruned.labels
# 
# pred_cluster$orig.results

pred_scores_cluster <- pred_cluster$scores

# Deleting columns without any values

pred_scores_cluster <- as.data.frame(pred_scores_cluster[,colSums(is.na(pred_scores_cluster)) != nrow(pred_scores_cluster)])

rownames(pred_scores_cluster) <- lbls

colnames(pred_scores_cluster)[7:12] <- paste0(colnames(pred_scores_cluster)[7:12],'-2')

pred_scores_cluster <- gather(pred_scores_cluster, Cell.Type, Score, factor_key = T)

pred_scores_cluster$seurat.cluster <- rep(lbls, 12)

pred_scores_cluster$ref <- ifelse(is.na(tstrsplit(pred_scores_cluster$Cell.Type,'-')[[2]]), 'ref1','ref2')

pred_scores_cluster$seurat.cluster2 <- as.factor(pred_scores_cluster$seurat.cluster)

pred_scores_cluster$score2 <- ifelse(is.na(pred_scores_cluster$Score), 0, pred_scores_cluster$Score)



ggplot(data = pred_scores_cluster, aes(y = Cell.Type, x = seurat.cluster2, 
                                       fill = ref, alpha = score2)) +
        geom_tile() +
        theme(axis.text.x = element_text(angle = 90, vjust = .2, hjust = .95)) +
        scale_fill_manual(values = c('red','blue'))

For the most part what we would expect and aligns with the labels generated from the integrated analysis (x-axis labels).

# pulling out the information we need
pred_cell_score <- pred_cell[,c('pruned.labels','reference')]

# Adding the seurat cluster to the cell
#summary(rownames(pred_cell_score) == rownames(wbm@meta.data))
pred_cell_score$cluster <- nbeal$new_cluster_IDs

cell_score <- as.data.frame(table(paste0(pred_cell_score$pruned.labels,'-',
                                         pred_cell_score$reference),
                                  pred_cell_score$cluster))

colnames(cell_score) <- c('Cell Type','Cluster','Count')

# ggplot(cell_score[cell_score$Cluster == 'MK',], aes(x = Cluster, y = Count, fill = `Cell Type`)) + 
#         geom_bar(stat = 'identity', position = position_dodge()) +
#         theme_bw() +
#         geom_text(stat = 'identity', aes(label = Count),
#                   position = position_dodge(width = .9),
#                   vjust = -.1, size = 2.5)

cell_score$cluster_count <- NA

for (i in unique(cell_score$Cluster)){
        cell_score[cell_score$Cluster ==i,]$cluster_count <- 
                sum(cell_score[cell_score$Cluster ==i,]$Count)
        
}

cell_score$count_per <- round(cell_score$Count/cell_score$cluster_count,2)*100

# ggplot(cell_score, aes (x = Cluster, y = count_per, fill = `Cell Type`)) +
#         geom_bar(stat = 'identity', position = position_dodge()) +
#         theme_bw() +
#         geom_text(stat = 'identity', aes(label = count_per),
#                   position = position_dodge(width = .9),
#                   vjust = -.1, size = 2.5)

ggplot(cell_score, aes(x = Cluster, y = `Cell Type`, fill = count_per)) +
        geom_tile() +
        scale_fill_gradient2(low = 'white', mid = 'red', high = 'darkred',
                             midpoint = 50)

cell_score$ref <- grepl('1',cell_score$`Cell Type`)
cell_score$ref <- ifelse(cell_score$ref == T, 'ref1','ref2')

ggplot(cell_score, aes(x = Cluster, y = `Cell Type`, alpha = count_per,
                       fill = ref)) +
        scale_fill_gradient2(low = 'white', mid = 'red', high = 'darkred',
                             midpoint = 50) +
        geom_tile() +
        scale_fill_manual(values = c('red','blue')) + 
        theme(axis.text.x = element_text(angle = 90, vjust = .2, hjust = .95))

Similar results once again. Still curious is why the ?MEP/MAST relate so highly to basophils.

Marker Gene Expression

Cell type specific marker gene expression. Genes were added to the list in two different ways: canonical markers that are well known in the field, and genes that distinguished clusters and were found to play a key role in specific cells.

Literature for Markers: Previously used

Ighd: immunoglobulin heavy constant delta. Seems to clearly be expressed by B-cells, but still working on a good reference.

Gata2: From Krause paper: a transcription factor required for both lineages but bind in different combinations ref

Cd68: a human macrophage marker ref. A more general ref

Vcam1: found papers using Vcam1+ monocytes, but haven’t found a great reference.

Alas2: an erythroid-specfiic 5-aminolevulinate synthase gene ref

Gata3: plays a role in the regulation of T-cells ref

Vwf and Itga2b: I figure the reference would best be left to y’all.

New Markers

Ly6g: from website it plays a role in monocyte, granulocyte, and neutrophil

Ngp: from uniprot “Expressed in myeloid bone marrow cells. Expressed in neutrophilic precursors (at protein level) (PubMed:8749713). Expressed in myeloid bone marrow cells (PubMed:21518852)”

Mmp8: neutrophil/lymphocyte collagenase link

marker.genes <- rev(c('Itga2b','Vwf','Gata3','Alas2','Vcam1','Cd68','Gata2','Ighd'))

DotPlot(wbm, features = marker.genes) +
        ylab ('Cell Cluster') + xlab ('Marker Genes') +
        theme(text = element_text(size = 10, family = 'sans'),
              axis.text.x = element_text(angle = 45, 
                                         vjust = .5, size = 10, family = 'sans'),
              axis.text.y = element_text(family = 'sans', size = 10),
              axis.title = element_text(family = 'sans', size = 12))

otros.marker.genes <- rev(c('Cebpe','Fcnb'))


DotPlot(wbm, features = otros.marker.genes) +
        ylab ('Cell Cluster') + xlab ('Marker Genes') +
        theme(text = element_text(size = 10, family = 'sans'),
              axis.text.x = element_text(angle = 45, 
                                         vjust = .5, size = 10, family = 'sans'),
              axis.text.y = element_text(family = 'sans', size = 10),
              axis.title = element_text(family = 'sans', size = 12))

new.markers <- c('Mcpt8','Prss34','Kit','Jchain','Hmgb1', 'Vpreb3','Igkc','Ighm')

hspc.markers <- c('SCA-1', 'Cd38','Thy1','Kit')

DotPlot(wbm, features = hspc.markers) +
        ylab ('Cell Cluster') + xlab ('Marker Genes') +
        theme(text = element_text(size = 10, family = 'sans'),
              axis.text.x = element_text(angle = 45, 
                                         vjust = .5, size = 10, family = 'sans'),
              axis.text.y = element_text(family = 'sans', size = 10),
              axis.title = element_text(family = 'sans', size = 12))
## Warning: Could not find Cd38 in the default search locations, found in RNA assay
## instead
## Warning in FetchData(object = object, vars = features, cells = cells): The
## following requested variables were not found: SCA-1

DotPlot(wbm, features = c(otros.marker.genes,new.markers,marker.genes)) +
        ylab ('Cell Cluster') + xlab ('Marker Genes') +
        theme(text = element_text(size = 10, family = 'sans'),
              axis.text.x = element_text(angle = 45, 
                                         vjust = .5, size = 10, family = 'sans'),
              axis.text.y = element_text(family = 'sans', size = 10),
              axis.title = element_text(family = 'sans', size = 12))

It seems that the lymphoid cells, monocyte and macrophage clusters are easily identifiable but the other questions have remaining questions.

  • What type of granulocyte are gran-1 and gran-2?

  • Are ?GMP and ?CMP truly a progenitor state

  • Are there better markers for Erythrocytes

  • Are the hspcs mixed in with MKs? Kit is of importance when labeling HSPCs but that MK cluster highest in Kit also is the only cluster somewhat widely expressing Vwf.

  • More specific markers for different stages of CMPs to helpfully clear some things up.

  • Why is there a cluster expressing both MEP and Mast cell markers so strongly?

MORE WORK

Human WBM Comparison

hwbm_ex <-Read10X(data.dir = './data/hum_ref_wbm/GSE120221_RAW/GSM3396161/')

hwbm <- CreateSeuratObject(counts = hwbm_ex, project = 'hwbm', min.cells = 3, min.features = 200)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
hwbm_cell_labels <- read.csv('./data/hum_ref_wbm/celltype.csv')

#hwbm_cell_labels

hwbm_cell_labels$cell <- tstrsplit(hwbm_cell_labels$X, '_', keep = 2)[[1]]
hwbm_cell_labels$exp <- tstrsplit(hwbm_cell_labels$X, "_", keep = 1)[[1]]

hwbm_cell_labels <- hwbm_cell_labels[hwbm_cell_labels$exp == 'A',]
hwbm_cell_labels$cell <- paste0(hwbm_cell_labels$cell, '-1')

hwbm[['percent.mt']] <- PercentageFeatureSet(hwbm, pattern = "^MT-")

cells_to_keep <- colnames(hwbm)[colnames(hwbm) %in% hwbm_cell_labels$cell]

hwbm <- subset(hwbm, cells = cells_to_keep)

genes_to_keep <- rownames(hwbm)[rownames(hwbm) %in% rownames(wbm)]

hwbm <- subset(hwbm, features = genes_to_keep)

hwbm <- NormalizeData(hwbm, normalization.method = 'LogNormalize', scale.factor = 10000)

hwbm <- ScaleData(hwbm, features = rownames(hwbm))

nbeal2 <- subset(nbeal, features = genes_to_keep)
#summary(hwbm_cell_labels$cell == rownames(hwbm@meta.data))

hwbm@meta.data$cell_id <- hwbm_cell_labels$type

Idents(hwbm) <- hwbm$cell_id

av_wbm <- AverageExpression(nbeal2)$RNA

av_hwbm <- AverageExpression(hwbm)$RNA

av <- cbind(av_wbm, av_hwbm)

av_cor <- cor(av, method = 'kendall')

av_cor2 <- as.data.frame(av_cor)

av_cor2$row <- rownames(av_cor2)

colnames(av_cor2)[1:15] <- paste0('Cluster',0:14)
rownames(av_cor2)[1:15] <- paste0('Cluster',0:14)
av_cor2$row <- rownames(av_cor2)

# gather(av_cor2, row, cor, Cluster0:Cluster14, factor_key = T)

av_cor2 <- reshape(av_cor2, direction = 'long', 
        varying = list(names(av_cor2)[1:34]),
        v.names = 'Correlation',
        idvar = c('row'),
        timevar = 'CT2',
        times = names(av_cor2)[1:34])

av_cor2$row <- factor(av_cor2$row, levels = unique(av_cor2$row))
av_cor2$CT2 <- factor(av_cor2$CT2, levels = unique(av_cor2$CT2))

ggplot(av_cor2, aes(x = row, y = CT2, fill = Correlation)) +
        geom_tile() +
        theme_bw() + 
        scale_fill_gradient2(high = 'darkred', low = 'white', mid = 'red',,
                             midpoint = 0.5, limit = c(0,1)) +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))

lvls <- levels(av_cor2$row)
av_cor3 <- av_cor2[av_cor2$row %in% lvls[1:15] & av_cor2$CT2 %in% lvls[16:34],]
ggplot(av_cor3, aes(x = row, y = CT2, fill = Correlation)) +
        geom_tile() +
        theme_bw() + 
        scale_fill_gradient2(high = 'darkred', low = 'white', mid = 'red',,
                             midpoint = 0.5, limit = c(0,1)) +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))

av_cor3$row_labels <- rep(lbls,19)

ggplot(av_cor3, aes(x = row_labels, y = CT2, fill = Correlation)) +
        geom_tile() +
        theme_bw() + 
        scale_fill_gradient2(high = 'darkred', low = 'white', mid = 'red',,
                             midpoint = 0.5, limit = c(0,1)) +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))

Granulocytes

Of interest are the many granulocyte populations and how we distinguish these from each other (if we even want to). From the above information it seems that both GMPs and CMPs share many characteristics with stem cells (ie HSPCs). We don’t think these are truly HSPCs but instead a downstream progenitor of HSPCs.

These clusters (CMPs, GMPs) also express many markers of Neutrophils very highly, whereas the Gran 1, 2, & 3 don’t express these in the same amount.

Going to specifically look at these markers (some from above):

Ly6g: from website it plays a role in monocyte, granulocyte, and neutrophil

Ngp: from uniprot “Expressed in myeloid bone marrow cells. Expressed in neutrophilic precursors (at protein level) (PubMed:8749713). Expressed in myeloid bone marrow cells (PubMed:21518852)”

Mmp8: neutrophil/lymphocyte collagenase link

Fcnb: identified as belonging tot he myeloid cell lineage by magnetic sorting by subsequent RT-PCR in BM cells link

Cebpe: encoded protein may be essential for terminal differentiation and functional maturation of committed granulocyte progenitor cells.

From Panglaodb

For Basophils: Hist1h1d, Nt5c3, Npl, Nfil3, Mboat1, Lama5, L1cam, Inf2, Ifitm1, Hist1h2ac

For Eosinophils: Ear2, Siglecf, Il5ra, Csf2, Ccl5, Ccl11, Cpa3, Prg3, Il5

For Neutrophils: Itgax, Ccrl2, Il1r2, Mrgpra2b, Bst1, Arg2, Sorl1, Trem1, Ly6g, Ncf1, Ccr1, Cd177, Trem3, Mmp9

gran.markers <- c('Ly6g','Ngp', 'Mmp8','Fcnb','Cebpe')

baso.markers <- c('Hist1h1d', 'Nt5c3', 'Npl', 'Nfil3', 'Mboat1', 'Lama5', 'L1cam', 'Inf2', 'Ifitm1', 'Hist1h2ac')

eos.markers <- c('Ear2', 'Siglecf', 'Il5ra', 'Csf2', 'Ccl5', 'Ccl11', 'Cpa3', 'Prg3', 'Il5')

neut.markers <- c('Itgax', 'Ccrl2', 'Il1r2', 'Mrgpra2b', 'Bst1', 'Arg2', 'Sorl1', 'Trem1', 'Ly6g', 'Ncf1', 'Ccr1', 'Cd177', 'Trem3', 'Mmp9')
nbeal.gran <- subset(nbeal, new_cluster_IDs %in% c(paste0('Gran-',1:3), '?CMP','?GMP'))

DotPlot(nbeal, features = gran.markers) + ggtitle ('Granulocyte Markers') +
        theme(axis.text.x = element_text(angle = 90, vjust = .2, hjust = .95))

DotPlot(nbeal.gran, features = gran.markers) + ggtitle ('Granulocyte Markers') +
        theme(axis.text.x = element_text(angle = 90, vjust = .2, hjust = .95))

DotPlot(nbeal, features = baso.markers) + ggtitle('Basophil Markers') +
        theme(axis.text.x = element_text(angle = 90, vjust = .2, hjust = .95))

DotPlot(nbeal.gran, features = baso.markers) + ggtitle('Basophil Markers') +
        theme(axis.text.x = element_text(angle = 90, vjust = .2, hjust = .95))

DotPlot(nbeal, features = neut.markers) + ggtitle('Neutrophil Markers') +
        theme(axis.text.x = element_text(angle = 90, vjust = .2, hjust = .95))

DotPlot(nbeal.gran, features = neut.markers) + ggtitle('Neutrophil Markers') +
        theme(axis.text.x = element_text(angle = 90, vjust = .2, hjust = .95))

CMP Cluster

Looking specfically at the CMP cluster, since it didn’t show as much expression of different granulocyte markers. This isn’t that surprising since it is a more distant relative to granulocytes.

# Comparing to all other cell types
cmp.markers.all <- FindMarkers(nbeal,
                               ident.1 = '?CMP',
                               logfc.threshold = log(2),
                               test.use = 'MAST')

# Comparing to granulocytes
cmp.markers.gran <- FindMarkers(nbeal.gran,
                                ident.1 = '?CMP',
                                logfc.threshold = log(2),
                                test.use = 'MAST')

# Cleaning up the lists

cmp.markers.all <- cmp.markers.all[cmp.markers.all$p_val_adj < 0.05,]
cmp.markers.all <- cmp.markers.all[order(cmp.markers.all$avg_logFC, decreasing = T),]

cmp.markers.gran <- cmp.markers.gran[cmp.markers.gran$p_val_adj < 0.05,]
cmp.markers.gran <- cmp.markers.gran[order(cmp.markers.gran$avg_logFC, decreasing = T),]

dim(cmp.markers.all)
## [1] 373   5
dim(cmp.markers.gran)
## [1] 787   5
# Including only upregulated genes

cmp.markers.all <- cmp.markers.all[cmp.markers.all$avg_logFC > 0,]
cmp.markers.gran <- cmp.markers.gran[cmp.markers.gran$avg_logFC > 0,]

dim(cmp.markers.all)
## [1] 219   5
dim(cmp.markers.gran)
## [1] 519   5
head(cmp.markers.gran)
##               p_val avg_logFC pct.1 pct.2     p_val_adj
## Elane  3.841772e-41  3.902178 0.885 0.214  6.957834e-37
## Prtn3  3.905288e-68  3.842342 1.000 0.283  7.072868e-64
## Mpo    1.844876e-75  3.734120 0.923 0.182  3.341254e-71
## Ctsg   8.912234e-55  3.235033 0.923 0.101  1.614095e-50
## Rpl12 1.450643e-116  2.960133 1.000 0.517 2.627259e-112
## Rps2  1.304510e-124  2.948185 1.000 0.720 2.362598e-120

Looking at progenitors

Alt Text

HSPCs

Reading through this paper it states that in mice all long-term HSCs are Hoxb5+

VlnPlot(nbeal, features = 'Hoxb5')

FeaturePlot(nbeal, features = 'Hoxb5', reduction = 'umap') 

FeaturePlot(nbeal, features = 'Kit', reduction = 'umap')

There seems to only be some slight expression of this gene in MKs, with a couple in Macrophages.

They also use a few genes to identify different multipotent progenitors (ie short-term HSPCs): Kit, Sca1, Flk2, CD34 and Slamf1.

hsc.genes <- c('Kit', 'Flt3', 'Ly6a', 'Cd34', 'Slamf1')
#hsc.genes %in% rownames(nbeal)

for (i in hsc.genes){
        print(VlnPlot(nbeal, i))
}

It seems like the MK cluster may actually be st-HSPCs.

They also use a few genes to identify different oligopotent progenitors: Kit, Flk2, IL7Ra, CD27, CD34 and FcgR.

prog.genes <- c('Kit', 'Flt3', 'Il7', 'Ly6a','Cd27', 'Cd34', 'Fcgr2b')
# prog.genes %in% rownames(nbeal)
# 
# rownames(nbeal)[grep('Fcgr',rownames(nbeal))]

for(i in prog.genes){
        print(VlnPlot(nbeal, i))
}

First I’m going to look at CMPs, which are:

Conclusion: the identification of CMPs seems pretty spot on

Next going to look for MEPs:

Conclusion: the identification of of MEPs seems uncertain. MEP/MAST match up some with both Kit and Sca but not Cd34 and FcgR. Perhaps there is a subset of MEPs within MEP/MAST.

Finally going to look at GMPs:

Conclusion: doesn’t seem like GMP is the best label and perhaps GMPs are a subset of the CMPs.

Subcluster of CMPs

cmps <- subset(nbeal, new_cluster_IDs == '?CMP')
dim(cmps)
## [1] 18111    26

Twenty-six seems like too small of a number for useful subclustering so going to look at all CMPs.

cmps <- subset(wbm, new_cluster_IDs == '?CMP')

dim(cmps)
## [1] 2000  355
table(cmps$Experiment)
## 
## Control   Migr1     Mpl 
##      26      92     237

What Distinguishes Granulocyte Clusters

Looking at what genes distinguish granulocyte clusters from one another (GMP, Gran 1-3)

MK Look

So when looking at markers for HSPCs it seems like the MK cluster may actually be short-term HSPCs (st-HSPCs), but it is also the cluster that shows expression of intracellular megakryocyte-associated genes in myelofibrosis paper.

mk.genes <- c('Pf4','Vwf','Itga2b','Mpig6b','Selp')

for (i in mk.genes){
        print(VlnPlot(nbeal, features = i))
}

It’s interesting but I would stick with these genes being MKs instead of HSPCs. Also an interesting paper discussing how Kit plays a role in MKs, so perhaps not surprising we do see Kit expressed in our MKs (when of the key genes to identiy st-HSPCs)

Going to look at splitting up the MKs based on the UMAP projection to see if we can see a split between MKs and HSCs.

mks <- subset(nbeal, new_cluster_IDs == 'MK')
#dim(mks)
DimPlot(mks, reduction = 'umap')

mks$UMAP1 <- as.data.frame(mks[['umap']]@cell.embeddings)$UMAP_1
mks$grp <- ifelse(mks$UMAP1 < 8, 1,2)
DimPlot(mks, reduction = 'umap', group.by = 'grp')

for (i in hsc.genes){
        print(VlnPlot(mks, features = i, group.by = 'grp'))
}

for (i in mk.genes){
        print(VlnPlot(mks, features = i, group.by = 'grp'))
}

Seems like group 2 may be more like HSCs, and group 1 seems to be more similar to MKs. I think it could make sense to split these into HSPCs and MKs.

MEP Introspection

This evidence supports that this is actually a mast cell progenitor (MCP), but may also contain other progenitors. Of interest are there cells showing different types of progenitors.

We see only 18 cells in the MEP/MAST cluster, not enough to truly subcluster by…may decide to look look at all wbm for this one.

nbeal.mep.mast <- subset(nbeal, new_cluster_IDs == '?MEP/MAST')
dim(nbeal.mep.mast)
## [1] 18111    18
DimPlot(nbeal.mep.mast, reduction = 'umap')

Instead here it makes sense to use all cells to help identify this cluster.

mast.mep <- subset(wbm, new_cluster_IDs == '?MEP/MAST')
DefaultAssay(mast.mep) <- 'RNA'

DimPlot(mast.mep, reduction = 'umap')

table(mast.mep$Experiment)
## 
## Control   Migr1     Mpl 
##      18      23     551
table(mast.mep$State)
## 
##       enrMigr1         enrMpl enrNbeal_cntrl          Migr1            Mpl 
##             12            500             10             11             51 
##    Nbeal_cntrl 
##              8

So we were using the control (Nbeal) to identify this cluster, but it is really the Mpl experiment where we see most of these cells and the enriched state at that.

DE Genes

mast.mep.markers <- FindMarkers(wbm,
                                ident.1 = '?MEP/MAST',
                                logfc.threshold = log(2),
                                test.use = 'MAST')
mast.mep.markers <- mast.mep.markers[mast.mep.markers$p_val_adj < 0.05,]
mast.mep.markers <- mast.mep.markers[order(mast.mep.markers$avg_logFC, decreasing = T),]

head(mast.mep.markers)
##        p_val avg_logFC pct.1 pct.2 p_val_adj
## Mcpt8      0  4.979810 0.990 0.374         0
## Prss34     0  4.306974 0.958 0.231         0
## Ccl4       0  3.566765 0.892 0.271         0
## Ccl3       0  3.296363 0.973 0.284         0
## Ccl9       0  3.149800 0.980 0.289         0
## Ccl6       0  3.129451 0.970 0.654         0
for (i in rownames(mast.mep.markers)[1:2]){
        print(VlnPlot(wbm, features = i, pt.size = 0))
}

Top DE Genes: * Mcpt8: from uniprot, mast cell protease 8 * Prss34: from uniprot, mast cell protease-11. Interesting paper from 2009 titles basophils preferentially express mouse Mast Cell Protease 11 among the mast cell tryptase family in contrast to mast cells.

MEP Cell Surface Markers

Looking at the same cell surface markers I check before in the NBeal control, but also looking at the Migr1 and Mpl clusters as well.

We want to see: Kit+Ly6a-Cd34-Fcgr2b-

mep.prog.genes <- c('Kit','Ly6a','Cd34','Fcgr2b')
for (i in mep.prog.genes){
        print(VlnPlot(mast.mep, features = i, split.by = 'Experiment'))
}

Things don’t seem any clearer when adding in the extra cells. Still don’t clearly follow the pattern we would expect. The control cells actually follow a GMP pattern: Kit+Ly6a-Cd34+Fcgr2b+

Krause Paper MEP Genes

In this paper they compare, common myeloid progenitors (CMPs), megakaryocyte-erthrocyte progenitors (MEPs), megakaryocyte progenitors (MKPs) and erythrocyte progenitors (ERPs). Thist study was done on human cells, and I will change the gene names to the mouse counterpart.

Genes used in the study:

  • Klf1 and Fli1 play antagonizing roles between MK and E lineages.
  • Gata-1, Gata-2, Runx1, and Tal1 are all transcription factors that are required for both lineages.
  • Myb can toggle MEP fates

Most of the other genes they show in the study were used to differentiate between the different progenitors.

Looking in all clusters

mep.genes <- c('Klf1','Fli1','Gata1','Gata2','Runx1','Tal1','Myb')

# mep.genes %in% rownames(mast.mep)
# rownames(mast.mep)[grepl('Gata', rownames(mast.mep), ignore.case = T)]

for (i in mep.genes){
        print(VlnPlot(wbm, features = i, pt.size = 0))
}
## Warning: Could not find Fli1 in the default search locations, found in RNA assay
## instead

From this is does seem like the ?MEP/MAST cluster looks like it could be MEPs.

Subclustering of MEP/MAST

DefaultAssay(mast.mep) <- 'RNA'
mast.mep <- NormalizeData(mast.mep, normalization.method = 'LogNormalize', scale.factor = 10000)

mast.mep <- FindVariableFeatures(mast.mep, selection.method = 'vst', nfeatures = 2000)

mast.mep <- ScaleData(mast.mep, features = rownames(mast.mep))

mast.mep <- RunPCA(mast.mep, features = VariableFeatures(mast.mep))

DimPlot(mast.mep, reduction = 'pca') + ggtitle("PCA Plot of MAST/MEP Subcluster")

DimPlot(mast.mep, reduction = 'pca', group.by = 'Condition')

DimPlot(mast.mep, reduction = 'pca', group.by = 'Experiment')

#ElbowPlot(mast.mep) # going with the classic 10

mast.mep <- FindNeighbors(mast.mep, dims = 1:10)
mast.mep <- FindClusters(mast.mep, resolution = .5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 592
## Number of edges: 17867
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7520
## Number of communities: 6
## Elapsed time: 0 seconds
mast.mep <- RunUMAP(mast.mep, dims = 1:10)

DimPlot(mast.mep, reduction = 'umap')

DimPlot(mast.mep, reduction = 'umap', group.by = 'Experiment')

Going to look at the MEP cell surface markers, the genes from the Krause paper in the subclusterings, and the mast cell proteases that identify this cluster; in the subclusters.

for (i in rownames(mast.mep.markers)[1:2]){
        print(VlnPlot(mast.mep, features = i, pt.size = 0))
}

The mast cell proteases are expressed highly in all subclusters.

for (i in mep.prog.genes){
        print(VlnPlot(mast.mep, features = i, pt.size = 0))
}

No clear expression following the MEP pattern (Kit+Ly6a-Cd34-Fcgr2b-), but that isn’t surprising since we see little to no expression of Kit.

for (i in mep.genes){
        print(VlnPlot(mast.mep, features = i, pt.size = 0))
}

The only gene that really distiniguished between clusters in Myb, which is used to toggle MEP fates between MKs and Erys. So it doesn’t seem like there is specifically a MEP cluster.

SingleR on Combined Data

Want to look at what this cluster correlates to when also using the Mpl (which makes up the majority of this cluster) and Migr1.

# Done earlier
# # Loading up reference datasets
# m.ref.immgen <- ImmGenData()
# m.ref.mus <- MouseRNAseqData()
# 
# ref <- list(m.ref.immgen, m.ref.mus)
# ref.label <- list(m.ref.immgen$label.main, m.ref.mus$label.main)

# Creating a sc experiment from our seurat object

SCmast.mep <- as.SingleCellExperiment(mast.mep)

# Predicting the sub cluster label
pred_clusters <- SingleR(test = SCmast.mep,
                        ref = ref,
                        labels = ref.label,
                        method = 'cluster',
                        clusters = SCmast.mep$seurat_clusters)

# Predicting whole cluster label
pred_whole_cluster <- SingleR(test = SCmast.mep,
                        ref = ref,
                        labels = ref.label,
                        method = 'cluster',
                        clusters = SCmast.mep$new_cluster_IDs)

# Predicting individual cell labels
pred_cell <- SingleR(test = SCmast.mep,
                     ref = ref,
                     labels = ref.label,
                     method = 'single')

Whole Cluster

pred_whole_cluster$scores[!is.na(pred_whole_cluster$scores)]
## [1] 0.6662644 0.2236751

The highest score from the first reference was basophils at 0.67 and from the second reference was 0.22 from granulocytes.

Subclusters

pred_scores_cluster <- pred_clusters$scores

# Deleting columns without any values

pred_scores_cluster <- as.data.frame(pred_scores_cluster[,colSums(is.na(pred_scores_cluster)) != nrow(pred_scores_cluster)])

head(pred_scores_cluster)
##   Basophils Granulocytes
## 1 0.6748228    0.2143338
## 2 0.6652487    0.2052108
## 3 0.6390680    0.2158268
## 4 0.6620739    0.2165743
## 5 0.6704810    0.2110923
## 6 0.3274409    0.5562877
pred_scores_cluster <- gather(pred_scores_cluster, Cell.Type, Score, factor_key = T)

pred_scores_cluster$seurat.cluster <- rep(0:5, 2)

pred_scores_cluster$seurat.cluster2 <- as.factor(pred_scores_cluster$seurat.cluster)

pred_scores_cluster$score2 <- ifelse(is.na(pred_scores_cluster$Score), 0, pred_scores_cluster$Score)

pred_scores_cluster$Cell.Type <- rep(c('Basophils-1','Granulocytes-2'), each = 6)

pred_scores_cluster$ref <- rep(c('1','2'), each = 6)

ggplot(data = pred_scores_cluster, aes(y = Cell.Type, x = seurat.cluster2, 
                                       fill = ref, alpha = score2)) +
        geom_tile() +
        theme(axis.text.x = element_text(angle = 90, vjust = .2, hjust = .95))

        #scale_fill_gradient2(low = 'white', mid = 'red', high = 'darkred', midpoint = .45)

Individual Cell Labels

table(pred_cell$pruned.labels)
## 
##    Basophils  Eosinophils Granulocytes   Mast cells   Stem cells 
##          487            5           16            6           29

We see the vast majority being basophils, with some mast cell, stem cells and other granulocytes.

Conclusions

The cell surface markers don’t show this cluster to follow the typical MEP pattern, but gene expression and cell surface markers don’t always correlate exactly. Many of the Krause genes show high expression,indicating this may be an MEP cluster.

The SingleR data while looking at just Nbeal cells, indicdates that this is a stem cell cluster. The SingleR datasets are missing progenitor states (MEP, CMP, GMP) and some subtypes (MKs), and it seems more likely that this is a cluster of progenitors.

Also looked at the SingleR for the combined data to increase the sample size. They conclusion from this data would be that this cell population in basophils.

Of interest here are two papers (1 2) that talk about the similarity between basophils and mast cells, and a common progenitor between the two. Below is an interesting figure from the first paper.

New Differentiation Pathway

VlnPlot(mast.mep, features = c('Gata1','Gata2'), pt.size = 0)

But we do see high expression of both Gata1 and Gata2 in all subclusters which would also indicate that these are being moved to MEPs and not towards BasMCPs.

So the conclusion for me is that there is conflicting evidence both for and agains MEPs and BasMCPs.

Final Table

#Idents(wbm)

wbm$new_cluster_IDs2 <- wbm$new_cluster_IDs

wbm$umap1 <- as.data.frame(wbm[['umap']]@cell.embeddings)$UMAP_1

wbm$new_cluster_IDs2 <- ifelse(wbm$new_cluster_IDs != 'MK', as.character(wbm$new_cluster_IDs2),
                               ifelse(wbm$umap1 < 8, 'MK','HSPC'))


wbm$new_cluster_IDs2 <- ifelse(wbm$new_cluster_IDs != '?CMP', 
                               as.character(wbm$new_cluster_IDs2), 'CMP')

# wbm$new_cluster_IDs2 <- ifelse(wbm$new_cluster_IDs != '?MEP/MAST', 
#                                as.character(wbm$new_cluster_IDs2), '?MCP')

wbm$new_cluster_IDs2 <- ifelse(wbm$new_cluster_IDs %in% c(paste0('Bcell-',1:4)), 'B-cell',
                               as.character(wbm$new_cluster_IDs2))

wbm$new_cluster_IDs2 <- ifelse(wbm$new_cluster_IDs %in% c(paste0('Gran-',1:3)), 'Granulocyte',
                               as.character(wbm$new_cluster_IDs2))

wbm$new_cluster_IDs2 <- ifelse(wbm$new_cluster_IDs %in% c(paste0('Bcell-',1:4)), 'B-cell',
                               as.character(wbm$new_cluster_IDs2))

wbm$new_cluster_IDs2 <- ifelse(wbm$new_cluster_IDs != '?GMP', 
                               as.character(wbm$new_cluster_IDs2), 'Granulocyte')

#DimPlot(wbm, reduction = 'umap', group.by = 'new_cluster_IDs2')

Idents(wbm) <- wbm$new_cluster_IDs2

DimPlot(wbm, reduction = 'umap', label = T, repel = T) + NoLegend()

MK DE Expression

Question was asked if there were any DE genes between Migr1 and Mpl within MK cells. Previously there were not any but above we split the MKs into both MKs and HSPCs. So below I’m going to do the DE again with the newer MK cluster.

mk.markers <- FindMarkers(wbm,
                          ident.1 = 'Mpl',
                          ident.2 = 'Migr1',
                          group.by = 'Experiment',
                          subset.ident = 'MK',
                          logfc.threshold = log(2),
                          test.use = 'MAST')

mk.markers <- mk.markers[mk.markers$p_val_adj < 0.05,]
mk.markers <- mk.markers[order(mk.markers$avg_logFC, decreasing = T),]

mk.markers
## [1] p_val     avg_logFC pct.1     pct.2     p_val_adj
## <0 rows> (or 0-length row.names)